This document demonstrates the use of the weightedRF and weightedLASSO functions for integrative GRN inference.
It shows how to select one optimal alpha value per gene, in order to apply data integration only to the target genes for which expression prediction error is reduced by using PWM prior information, significantly more than in a baseline model.
Import of the expression data and the N-responsive genes and regulators :
load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426 45
load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426 201
pwm_imputed <- pwm_occurrence
pwm_imputed[is.na(pwm_imputed)] <- 0.5
ALPHAS <- seq(0,1, by = 0.1)
The following steps are time and CPU intensive, so the result files can just be loaded to be analysed in further steps.
Generating 100 repetitions of weightedRF MSE estimation on true data, and on shuffled data.
lmses <- data.frame(row.names = genes)
N <-100
for(alpha in ALPHAS){
for(perm in 1:N){
lmses[,paste(as.character(alpha), perm, "true_data")] <- weightedRF_inference_MSE(counts, genes, tfs, alpha = alpha, nTrees = 2000,
pwm_occurrence = pwm_occurrence, nCores = 45, tf_expression_permutation = F)
}
for(perm in 1:N){
lmses[,paste(as.character(alpha), perm, "shuffled")] <- weightedRF_inference_MSE(counts, genes, tfs, alpha = alpha, nTrees = 2000,
pwm_occurrence = pwm_occurrence, nCores = 45, tf_expression_permutation = T)
}
}
save(lmses, file = "results/brf_perumtations_mse_all_genes_predict.rdata")
subset<-unique(rownames(lmses))
Generating 100 repetitions of weightedLASSO MSE estimation on true data, and on shuffled data.
# lmses <- data.frame(row.names = genes)
# N<-50
# for(alpha in ALPHAS){
# for(perm in 1:N){
# lmses[,paste(as.character(alpha), perm, "true_data")] <- weightedLASSO_inference_MSE(counts, genes, tfs, alpha = alpha, N=50,
# pwm_occurrence = pwm_occurrence, nCores = 40, tf_expression_permutation = F)
#
# lmses[,paste(as.character(alpha), perm, "shuffled")] <- weightedLASSO_inference_MSE(counts, genes, tfs, alpha = alpha, N=50,
# pwm_occurrence = pwm_occurrence, nCores = 40, tf_expression_permutation = T)
# }
# }
# save(lmses, file = "results/lasso_perumtations_mse_all_genes_test.rdata")
nCores = 45
mats <- list()
nrep <- 100
for(alpha in 0.99){ # exploring PWM integration strength
for(rep in 1:nrep){ # exploring inherent variability
mat_rf <- weightedRF_inference(counts, genes, tfs, nTrees = 2000,
alpha = alpha,
pwm_occurrence = pwm_occurrence,
nCores = nCores,
importance = "%IncMSE")
mat_rf_perm <- weightedRF_inference(counts, genes, tfs, nTrees = 2000,
alpha = alpha, tf_expression_permutation = TRUE,
pwm_occurrence = pwm_occurrence,
nCores = nCores,
importance = "%IncMSE")
mats[[paste0("bRF_", as.character(alpha), '_trueData_', rep)]] <- mat_rf
mats[[paste0("bRF_", as.character(alpha), '_shuffled_', rep)]] <- mat_rf_perm
}
}
save(mats, file = "results/100_permutations_bRF_importances_inf.rdata")
# thresholds the regulatory weights at a certain density to build GRNs
edges <- list()
densities <- c(0.005, 0.01,0.05)
for(name in names(mats)){
for(density in densities){# exploring importance threshold stringency
edges[[paste0(name, '_', density)]] <-
weightedRF_network(mats[[name]], density = density, pwm_occurrence, genes, tfs)
}
}
save(edges, file = "results/100_permutations_bRF_edges_inf.rdata")
# validation of GRN edges against DAP-Seq
settings <- c("model", "alpha", "dataset", "rep", "density")
val_dap <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 35,
input_genes = genes,
input_tfs = tfs
)
val_dap[, settings] <-
str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_rf_validation_dap_inf.rdata")
nCores = 40
mats <- list()
nrep <- 50
for(alpha in ALPHAS){ # exploring PWM integration strength
for(rep in 1:nrep){ # exploring inherent variability
mat_lasso <- weightedLASSO_inference(counts, genes, tfs,
alpha = alpha, N = 10,
tf_expression_permutation = FALSE,
int_pwm_noise = 0, mda_type = "shuffle",
pwm_occurrence = pwm_occurrence,
nCores = nCores)
mat_lasso_perm <- weightedLASSO_inference(counts, genes, tfs,
alpha = alpha, N = 50,
tf_expression_permutation = TRUE,
int_pwm_noise = 0, mda_type = "shuffle",
pwm_occurrence = pwm_occurrence,
nCores = nCores)
mats[[paste0("LASSO_", as.character(alpha), '_trueData_', rep)]] <- mat_lasso
mats[[paste0("LASSO_", as.character(alpha), '_shuffled_', rep)]] <- mat_lasso_perm
}
}
save(mats, file = "results/100_permutations_lasso_mda_shuffle.rdata")
# thresholds the regulatory weights at certain densities to build GRNs
edges <- list()
lmses <- data.frame(row.names = genes)
densities <- c(0.005, 0.01,0.05)
for(name in names(mats)){
for(density in densities){# exploring importance threshold stringency
edges[[paste0(name, '_', density)]] <-
weightedLASSO_network(mats[[name]], density = density, pwm_occurrence,
genes, tfs, decreasing = TRUE)
lmses[,name] <- mats[[name]]["mse",]
}
}
save(edges, file = "results/100_permutations_lasso_edges.rdata")
save(lmses, file = "results/lasso_perumtations_mse_all_genes_test.rdata")
# validation of GRN edges against DAP-Seq
settings <- c("model", "alpha", "dataset", "rep", "density")
val_dap <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 35,
input_genes = genes,
input_tfs = tfs
)
val_dap[, settings] <-
str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_lasso_validation_dap.rdata")
# prior values
pwm_imputed <- pwm_occurrence
pwm_imputed[is.na(pwm_imputed)] <- 0.5
# parallel sapply to parallelise the computing of optimal alphas
mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) {
FUN <- match.fun(FUN)
answer <- parallel::mclapply(X = X, FUN = FUN, ...)
if (USE.NAMES && is.character(X) && is.null(names(answer)))
names(answer) <- X
if (!isFALSE(simplify) && length(answer))
simplify2array(answer, higher = (simplify == "array"))
else answer
}
# importance metrics and MSE for weightedRF
load("results/brf_perumtations_mse_all_genes_predict.rdata")
load("results/100_permutations_bRF_importances_inf.rdata")
# needs an appropriate mats variable and a lmses variable to be loaded
# plots the importance of TFs that have a specific prior value, here one,
# as alpha is increased
draw_gene_effective_integration <- function(gene, prior=1, type = "rank", without_1 = F){
tfs_with_motif <- names(which(pwm_imputed[gene,]== prior))
if(type == "rank")
data <- data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])}))
if(type == "imp")
data <- data.frame(lapply(mats, function(mat){mean(mat[,gene][tfs_with_motif])}))
data <- data %>%
gather(key = "setting", value = "summed_importance") %>%
separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_")
if(without_1)
data <- data %>%
filter(alpha != "1")
plot <- data %>%
group_by(alpha, dataset) %>%
mutate(mean_imp = mean(summed_importance),
sd_imp = sd(summed_importance),
alpha = as.numeric(alpha))%>%
ggplot(aes(x=alpha, y = summed_importance, color = dataset, fill = dataset)) +
geom_point(alpha = 0.1) + geom_line(aes(y=mean_imp))+
geom_ribbon(aes(ymin = mean_imp - sd_imp ,
ymax = mean_imp + sd_imp ),
alpha = .4) +theme_pubr(legend = "none")+
ylab("Rank of TFs with Pi=1")+
ggtitle("Effective data integration") +
labs(subtitle = gene)+
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))
plot + xlab(expression(alpha))
}
# plots the MSE as alpha is increased
draw_gene_mse <- function(gene, title = NULL, without_1 = F){
data <-
lmses[gene, ] %>%
gather() %>%
separate(key,
into = c("model", "alpha", "dataset", "rep"),
sep = "_")
if(without_1)
data <- data %>%
filter(alpha != "1")
data %>%
group_by(alpha, dataset) %>%
mutate(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T)) %>%
ggplot(aes(
x = as.numeric(alpha),
y = value,
color = dataset,
fill = dataset
)) +ggtitle(paste("MSE"))+ylab("MSE/Var(gene)")+
geom_ribbon(aes(ymin = mean_mse - sd_mse ,
ymax = mean_mse + sd_mse ),
alpha = .4) +theme_pubr(legend = "top")+
geom_point(alpha = 0.1) + geom_line(aes(y=mean_mse))+xlab(expression(alpha))+
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))
}
# plots the MSE as a function of effective data integration
get_opt_alpha_per_gene <- function(gene, type = "rank", return_cluster = F, without_1 = F){
tfs_with_motif <- names(which(pwm_occurrence[gene,]==1))
# gene with no TFBS has an optimal alpha of 0
if(return_cluster & length(tfs_with_motif)==0) return(0)
# else
if(length(tfs_with_motif)>0){
if(type == "rank")
data <- data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])}))
if(type == "imp")
data <- data.frame(lapply(mats, function(mat){mean(mat[,gene][tfs_with_motif])}))
inte <- data %>%
gather(key = "setting", value = "summed_importance") %>%
separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_") %>%
group_by(alpha, dataset) %>%
mutate(mean_imp = mean(summed_importance, na.rm=T),
sd_imp = sd(summed_importance, na.rm=T),
alpha = as.numeric(alpha))
curves <- lmses[gene, ] %>%
gather() %>%
separate(key,
into = c("model", "alpha", "dataset", "rep"),
sep = "_") %>%
group_by(alpha, dataset) %>%
mutate(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T)) %>%
mutate(alpha = as.numeric(alpha))%>%
full_join(inte, by = c("alpha", "dataset", "rep")) %>%
group_by(alpha, mean_imp, dataset) %>%
summarise(mean_mse = mean(value, na.rm = T),
sd_mse = sd(value, na.rm = T))
if(without_1)
curves <- curves %>%
filter(alpha != 1)
curves <- curves %>%
group_by(dataset) %>%
arrange(dataset, mean_imp)%>%
mutate(imps=curves[curves$dataset=="trueData", ]$mean_imp) %>%
mutate(approx_mse = approx(mean_imp,mean_mse,curves[curves$dataset=="trueData", ]$mean_imp, rule=2)$y,
approx_sd = approx(mean_imp,sd_mse,curves[curves$dataset=="trueData", ]$mean_imp, rule=2)$y)
true <- curves[curves$dataset=="trueData",]
shuff <- curves[curves$dataset!="trueData",]
# true$div <- (shuff$approx_mse - shuff$approx_sd) - (true$mean_mse)
true$div <- ifelse((shuff$approx_mse - true$mean_mse)/shuff$approx_sd>1,
(shuff$approx_mse - true$mean_mse)/shuff$approx_sd,
0)
if(max(true$div)>0) alpha_opt <- true[true$div == max(true$div),]$alpha
else alpha_opt <- 0
eff_opt = true[true$alpha==alpha_opt,]$mean_imp
padding = ifelse(type=="rank", ifelse(alpha_opt<0.2, 17, -17), 0)
if(return_cluster) return(alpha_opt)
curves%>%
ggplot(aes(y=mean_mse, x = mean_imp, color = dataset, fill = dataset))+
# geom_ribbon(aes(ymin =mean_mse-sd_mse ,
# ymax = mean_mse + sd_mse), alpha = .4)+
geom_ribbon(aes(x=imps,ymin =approx_mse-approx_sd ,
ymax = approx_mse + approx_sd), alpha = .25)+
geom_point(alpha = 0.1, size = 0.5) +
geom_line(aes(x=mean_imp, y = mean_mse), size=1) +
# geom_line(aes(x=imps, y = approx_mse), col="black")+
theme_pubr(legend = "none") +
ylab("MSE/Var(gene)") + xlab("Effective data integration") + ggtitle("MSE=f(effective integration)")+
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
geom_vline(xintercept = eff_opt, size = 2, col="#4670CD") +
annotate("text", x=eff_opt+padding, y=max(shuff$mean_mse),
label=paste("Optimal\nintegration:\nalpha =", alpha_opt),
angle=0, col = "#4670CD", size =3.5 )
}
else ggplot()
}
gene_no_benefit <- "AT1G30270"
gene_benefit_optimum <- "AT1G14720"
gene_benefit <- "AT2G24120"
n <- draw_gene_effective_integration(gene_no_benefit, prior = 1) +
draw_gene_mse(gene_no_benefit)+
get_opt_alpha_per_gene(gene_no_benefit)+
plot_annotation(title = gene_no_benefit)
o <- draw_gene_effective_integration(gene_benefit_optimum, prior = 1) +
draw_gene_mse(gene_benefit_optimum)+theme(legend.position = "none")+
get_opt_alpha_per_gene(gene_benefit_optimum)+
plot_annotation(title = gene_benefit_optimum)
p <- draw_gene_effective_integration(gene_benefit, prior = 1) +
draw_gene_mse(gene_benefit)+theme(legend.position = "none")+
get_opt_alpha_per_gene(gene_benefit)+
plot_annotation(title = gene_benefit)
figure <- n/o/p;figure
ggexport(figure, filename = "results/gene_examples_weightedRF.pdf", width = 10, height = 9)
# value of alpha per genes:
alphas_rf <- mcsapply(genes, get_opt_alpha_per_gene, type = "rank",
return_cluster=T, mc.cores=34)
hist(alphas_rf)
save(alphas_rf, file = "results/alpha_per_gene_weighted_RF.rdata")
#turns old mse of RF into new format returned by inference
colnames(lmses) <-
paste0("RF", "_", str_split_fixed(colnames(lmses), ' ', 3)[,1], '_',
str_split_fixed(colnames(lmses), ' ', 3)[,3], "_",
str_split_fixed(colnames(lmses), ' ', 3)[,2]) %>%
str_replace("_data", "Data")
save(lmses, file = "results/brf_perumtations_mse_all_genes_predict.rdata")
Doing the same for weightedLASSO:
# for the lasso
load("results/lasso_perumtations_mse_all_genes_test.rdata")
load("results/100_permutations_lasso_mda_shuffle.rdata")
# load("results/100_permutations_lasso_edges.rdata")
# gene_no_benefit <- "AT5G66850"
# # the same as RFs
# gene_benefit_optimum <- "AT1G14720"
# gene_benefit <- "AT2G38180"
n <- draw_gene_effective_integration(gene_no_benefit, prior = 1) +
draw_gene_mse(gene_no_benefit)+
get_opt_alpha_per_gene(gene_no_benefit)+
plot_annotation(title = gene_no_benefit)
o <- draw_gene_effective_integration(gene_benefit_optimum, prior = 1) +
draw_gene_mse(gene_benefit_optimum)+theme(legend.position = "none")+
get_opt_alpha_per_gene(gene_benefit_optimum)+
plot_annotation(title = gene_benefit_optimum)
p <- draw_gene_effective_integration(gene_benefit, prior = 1) +
draw_gene_mse(gene_benefit)+theme(legend.position = "none")+
get_opt_alpha_per_gene(gene_benefit)+
plot_annotation(title = gene_benefit)
n/o/p
figure <- n/o/p
ggexport(figure, filename = "results/gene_examples_weightedLASSO.pdf", width = 10, height = 9)
#
# nrt <- sample(int, size = 1)
# draw_gene_effective_integration(nrt, prior = 1, type = "imp")
# draw_gene_effective_integration(nrt, prior = 0.5, type = "imp")
# draw_gene_effective_integration(nrt, prior = 0, type = "imp")
# get_opt_alpha_per_gene(nrt, type = "imp")
# value of alpha per genes:
alphas_lasso <- mcsapply(genes, get_opt_alpha_per_gene, type = "rank",
return_cluster=T, mc.cores=34)
hist(alphas_lasso)
save(alphas_lasso, file = "results/alpha_per_gene_weighted_LASSO_test.rdata")
### test of specific alphas
# mat_lasso <- weightedLASSO_inference(counts, genes, tfs,
# alpha = alphas_lasso, N = 50,
# tf_expression_permutation = FALSE,
# int_pwm_noise = 0, mda_type = "shuffle",
# pwm_occurrence = pwm_occurrence,
# nCores = nCores)
#
# mat_lasso_perm <- weightedLASSO_inference(counts, genes, tfs,
# alpha = alphas_lasso, N = 50,
# tf_expression_permutation = TRUE,
# int_pwm_noise = 0, mda_type = "shuffle",
# pwm_occurrence = pwm_occurrence,
# nCores = nCores)
#
#
# grn <- weightedLASSO_network(mat_lasso, density = 0.005, pwm_occurrence,
# genes, tfs, decreasing = TRUE)
# grn_perm <- weightedLASSO_network(mat_lasso_perm, density = 0.005, pwm_occurrence,
# genes, tfs, decreasing = TRUE)
#
#
# median(mats$LASSO_1_trueData_1["mse",])
# median(mat_lasso["mse",])
# median(mat_lasso_perm["mse",])
# median(mats$LASSO_0_trueData_10["mse",])
#
# mean(mats$LASSO_1_trueData_1["mse",])
# mean(mat_lasso_perm["mse",])
# mean(mats$LASSO_0_trueData_10["mse",])
#
# mean(grn$pwm)
# evaluate_network(grn, genes, tfs, validation = "DAPSeq")[c("tpr", "recall")]
# evaluate_network(grn_perm, genes, tfs, validation = "DAPSeq")[c("tpr", "recall")]
With alpha = 1
lasso_1<- names(alphas_lasso[alphas_lasso==0.9])
subset <- sample(lasso_1, size = 8, replace = F)
wo1 <- F
for(nrt in subset ){
print(draw_gene_effective_integration(nrt, prior = 1, without_1 = wo1) +
draw_gene_mse(nrt, without_1 = wo1)+theme(legend.position = "none")+
get_opt_alpha_per_gene(nrt, without_1 = wo1)+
plot_annotation(title = nrt))
}
Without alpha = 1
wo1 <- T
for(nrt in subset ){
print(draw_gene_effective_integration(nrt, prior = 1, without_1 = wo1) +
draw_gene_mse(nrt, without_1 = wo1)+theme(legend.position = "none")+
get_opt_alpha_per_gene(nrt, without_1 = wo1)+
plot_annotation(title = nrt))
}
lasso_1<- names(alphas_lasso[alphas_lasso==1])
for(nrt in sample(lasso_1, size = 10, replace = F)){
print(draw_gene_effective_integration(nrt, prior = 1) +
draw_gene_mse(nrt)+theme(legend.position = "none")+
get_opt_alpha_per_gene(nrt)+
plot_annotation(title = nrt))
}
In alphas opt at 0.9
with alpha = 1
wo1 <-F
for(nrt in subset){
print(draw_gene_effective_integration(nrt, prior = 1, without_1 = wo1) + ggtitle("Pi=1, rank")+
draw_gene_effective_integration(nrt, prior = 1, type="imp", without_1 = wo1) +ggtitle("Pi=1")+
draw_gene_effective_integration(nrt, prior = 0.5, without_1 = wo1) +ggtitle("Pi=1/2, rank")+
draw_gene_effective_integration(nrt, prior = 0.5, type="imp", without_1 = wo1) +ggtitle("Pi=1/2")+
draw_gene_effective_integration(nrt, prior = 0, without_1 = wo1) +ggtitle("Pi=0, rank")+
draw_gene_effective_integration(nrt, prior = 0, type="imp", without_1 = wo1) +ggtitle("Pi=0")+
get_opt_alpha_per_gene(nrt, without_1 = wo1)+
get_opt_alpha_per_gene(nrt, type = "imp", without_1 = wo1)+
plot_annotation(title = nrt)+plot_layout(ncol=4))
}
Without alpha = 1
wo1 <- T
for(nrt in subset){
print(draw_gene_effective_integration(nrt, prior = 1, without_1 = wo1) + ggtitle("Pi=1, rank")+
draw_gene_effective_integration(nrt, prior = 1, type="imp", without_1 = wo1) +ggtitle("Pi=1")+
draw_gene_effective_integration(nrt, prior = 0.5, without_1 = wo1) +ggtitle("Pi=1/2, rank")+
draw_gene_effective_integration(nrt, prior = 0.5, type="imp", without_1 = wo1) +ggtitle("Pi=1/2")+
draw_gene_effective_integration(nrt, prior = 0, without_1 = wo1) +ggtitle("Pi=0, rank")+
draw_gene_effective_integration(nrt, prior = 0, type="imp", without_1 = wo1) +ggtitle("Pi=0")+
get_opt_alpha_per_gene(nrt, without_1 = wo1)+
get_opt_alpha_per_gene(nrt, type = "imp", without_1 = wo1)+
plot_annotation(title = nrt)+plot_layout(ncol=4))
}